# remove everything 
remove(list = ls())
#set.seed(1907)
# libraries
library(dplyr)
library(texreg)
library(stats)
library(MASS)
library(ROCR)
library(survival)
library(gdata)

# Data prep ####

# import the data
d <- read.csv("data.csv")

## factor variables ####

# Incomp
d$Incompatibility <- as.factor(d$Incompatibility)
d$Intensity <- as.factor(d$Intensity)
d$Region <- as.factor(d$Region)

# Data: Select Variables ####
d <- d %>% dplyr::select(SideA, SideB, DyadId, GWNoA, Year, 
                          Negotiation, demref, polyarchy, duration_dy_ln, imp.gdppcln, imp.popln,
                          Intensity,
                          Region, RebStrBin.imp, Ethnic.incomp, Incompatibility, 
                          medprev2y,
                          ExChange, oda.pc.ln, UNPKO.total.ln, BdBest_ln, tsln, tsln2, tsln3)

# Main Models ####
#model 1 
fm1 <- as.formula (Negotiation ~ polyarchy + demref + duration_dy_ln + imp.gdppcln + imp.popln + Intensity +
                     Region + tsln + tsln2 + tsln3)
#model 2
fm2 <- as.formula (Negotiation ~ polyarchy + demref + duration_dy_ln + imp.gdppcln + imp.popln + Intensity +
                     Region + RebStrBin.imp + Ethnic.incomp + ExChange + oda.pc.ln + medprev2y +
                     tsln + tsln2 + tsln3 )

#model 3
fm3 <- as.formula (Negotiation ~ polyarchy + demref + duration_dy_ln + imp.gdppcln + imp.popln + Intensity +
                     Region + RebStrBin.imp + Ethnic.incomp + ExChange + oda.pc.ln + medprev2y +
                     tsln + tsln2 + tsln3 + BdBest_ln + UNPKO.total.ln)


### main model regressions
m1 <- glm(formula = fm1, data = d, family = binomial(link = "logit"))
m2 <- glm(formula = fm2, data = d, family = binomial(link = "logit"))
m3 <- glm(formula = fm3, data = d, family = binomial(link = "logit"))

screenreg(list(m1, m2, m3))

# After ROCR update, it does not like NA's anymore
d1 <- m1$model
d2 <- m2$model
d3 <- m3$model

# Reduced model formulas ####
#model 1 
fm1_r <- as.formula (Negotiation ~ polyarchy + duration_dy_ln + imp.gdppcln + imp.popln + Intensity +
                      Region + tsln + tsln2 + tsln3)
#model 2
fm2_r <- as.formula (Negotiation ~ polyarchy + duration_dy_ln + imp.gdppcln + imp.popln + Intensity +
                      Region + RebStrBin.imp + Ethnic.incomp + ExChange + oda.pc.ln + medprev2y +
                      tsln + tsln2 + tsln3 )

#model 3
fm3_r <- as.formula (Negotiation ~ polyarchy + duration_dy_ln + imp.gdppcln + imp.popln + Intensity +
                      Region + RebStrBin.imp + Ethnic.incomp + ExChange + oda.pc.ln + medprev2y +
                      tsln + tsln2 + tsln3 + BdBest_ln + UNPKO.total.ln)
### reduced model regressions
m1_r <- glm(formula = fm1_r, data = d1, family = binomial(link = "logit"))
m2_r <- glm(formula = fm2_r, data = d2, family = binomial(link = "logit"))
m3_r <- glm(formula = fm3_r, data = d3, family = binomial(link = "logit"))

screenreg(list(m1_r, m2_r, m3_r))

# Four-fold cross validation ####

## How many runs####
nrun <- 10 # make it 10k (takes time)

# catcher for models 1-3
m1_auc_f <- rep(NA, nrun)
m1_auc_r <- rep(NA, nrun)
m2_auc_f <- rep(NA, nrun)
m2_auc_r <- rep(NA, nrun)
m3_auc_f <- rep(NA, nrun)
m3_auc_r <- rep(NA, nrun)


# loop for cross-validation 
for (k in 1:nrun){ #loop over nrun
  
  # assign each observation to a group
  g1 <- sample(1:4, nrow(d1), replace = T)
  g2 <- sample(1:4, nrow(d2), replace = T)
  g3 <- sample(1:4, nrow(d3), replace = T)
  
  d1$g <- g1 
  d2$g <- g2 
  d3$g <- g3 
  
  # step1 is in-sample estimation
  d1$s1_m1f <- NA # m1 full
  d1$s1_m1r <- NA # m1 reduced
  
  d2$s1_m2f <- NA # m2 full
  d2$s1_m2r <- NA # m2 reduced
  
  d3$s1_m3f <- NA # m3 full
  d3$s1_m3r <- NA # m3 reduced
  
  # step2 is out-of-sample estimation 
  d1$s2_m1f <- NA # m1 full
  d1$s2_m1r <- NA # m1 reduced
  
  d2$s2_m2f <- NA # m2 full
  d2$s2_m2r <- NA # m2 reduced
  
  d3$s2_m3f <- NA # m3 full
  d3$s2_m3r <- NA # m3 reduced
  
  
  # regress over each quarter with full
  for (i in 1:4) {
    
    # remove one quarter and regres for each three specifications (full)
    est_m1f <- glm(formula = fm1, data = d1[d1$g != i, ], family = binomial(link = "logit"))
    est_m2f <- glm(formula = fm2, data = d2[d2$g != i, ], family = binomial(link = "logit"))
    est_m3f <- glm(formula = fm3, data = d3[d3$g != i, ], family = binomial(link = "logit"))
    
    # predict over full sample 
    d1$s1_m1f <- predict(est_m1f, newdata = d1, type = "response")
    d2$s1_m2f <- predict(est_m2f, newdata = d2, type = "response")
    d3$s1_m3f <- predict(est_m3f, newdata = d3, type = "response")
    
    # out-of-sample part
    d1$s2_m1f[d1$g == i] <- d1$s1_m1f[d1$g == i]
    d2$s2_m2f[d2$g == i] <- d2$s1_m2f[d2$g == i]
    d3$s2_m3f[d3$g == i] <- d3$s1_m3f[d3$g == i]
  }
  
  # predicted probability (using ROCR package)
  pr_m1f         <- prediction(d1$s2_m1f, d1$Negotiation) # model1 prediction
  pr_m2f         <- prediction(d2$s2_m2f, d2$Negotiation) # model2 prediction 
  pr_m3f         <- prediction(d3$s2_m3f, d3$Negotiation) # model3 prediction
  
  # see the auc
  auc_m1f        <- performance(pr_m1f, measure = "auc") # mode1 auc
  auc_m2f        <- performance(pr_m2f, measure = "auc") # mode2 auc
  auc_m3f        <- performance(pr_m3f, measure = "auc") # mode3 auc
  
  # extract aucs out of the loop
  m1_auc_f[k]  <- auc_m1f@y.values[[1]]
  m2_auc_f[k]  <- auc_m2f@y.values[[1]]
  m3_auc_f[k]  <- auc_m3f@y.values[[1]]
  
  ## now reduced
  for (i in 1:4) {
    # remove one quarter and regres 
    est_m1r <- glm(formula = fm1_r, data = d1[d1$g != i, ], family = binomial(link = "logit"))
    est_m2r <- glm(formula = fm2_r, data = d2[d2$g != i, ], family = binomial(link = "logit"))
    est_m3r <- glm(formula = fm3_r, data = d3[d3$g != i, ], family = binomial(link = "logit"))
    
    # predict over full sample 
    d1$s1_m1r <- predict(est_m1r, newdata = d1, type = "response")
    d2$s1_m2r <- predict(est_m2r, newdata = d2, type = "response")
    d3$s1_m3r <- predict(est_m3r, newdata = d3, type = "response")
    
    # out-of-sample part
    d1$s2_m1r[d1$g == i] <- d1$s1_m1r[d1$g == i]
    d2$s2_m2r[d2$g == i] <- d2$s1_m2r[d2$g == i]
    d3$s2_m3r[d3$g == i] <- d3$s1_m3r[d3$g == i]
  }
  
  pr_m1r        <- prediction(d1$s2_m1r, d1$Negotiation)
  pr_m2r        <- prediction(d2$s2_m2r, d2$Negotiation)
  pr_m3r        <- prediction(d3$s2_m3r, d3$Negotiation)
  
  
  auc_m1r       <- performance(pr_m1r, measure = "auc")
  auc_m2r       <- performance(pr_m2r, measure = "auc")
  auc_m3r       <- performance(pr_m3r, measure = "auc")
  
  m1_auc_r[k] <- auc_m1r@y.values[[1]]
  m2_auc_r[k] <- auc_m2r@y.values[[1]]
  m3_auc_r[k] <- auc_m3r@y.values[[1]]
  
}

# BOXPLOT AUC #### 
dev.off()
png("auc.png", units = "cm", 
    width = 15, height = 12, res = 300)
par(mar = c(3,3,1,1), mgp = c(1.5,0.5,0))
boxplot(m1_auc_r, m1_auc_f, m2_auc_r, m2_auc_f, m3_auc_r, m3_auc_f, 
        ylab = "AUC", ylim = c(0.805, 0.858), pch = 16, axes = T, cex = 0.4, cex.axis = 0.8) 
par(mgp = c(1.5,1,0))
axis(1, at = 1:6, labels = c("Model 1\nExcluded", "Model 1\nIncluded",
                             "Model 2\nExcluded", "Model 2\nIncluded",
                             "Model 3\nExcluded", "Model 3\nIncluded"), lwd = 0, cex.axis = 0.75)
dev.off()


# In Sample ####

# predicted probability for full model
d$p_m1f <- predict(m1, newdata = d, type = "response")
d$p_m2f <- predict(m2, newdata = d, type = "response")
d$p_m3f <- predict(m3, newdata = d, type = "response")

# reduced model regressions
m1_r <- glm(formula = fm1_r, data = d, family = binomial(link = "logit"))
m2_r <- glm(formula = fm2_r, data = d, family = binomial(link = "logit"))
m3_r <- glm(formula = fm3_r, data = d, family = binomial(link = "logit"))

# predicted probability for reduced model
d$p_m1r <- predict(m1_r, newdata = d, type = "response")
d$p_m2r <- predict(m2_r, newdata = d, type = "response")
d$p_m3r <- predict(m3_r, newdata = d, type = "response")

x <- d %>% dplyr::select(SideA, SideB, Year, Negotiation, demref,
                         p_m1f, p_m1r,
                         p_m2f, p_m2r,
                         p_m3f, p_m3r)

# Separation plot ####
# color 
#x <- x[x$demref == 1, ]
x$col <- NA
x$col[x$Negotiation == 1 & x$demref == 1] <- "red" 
x$col[x$Negotiation == 1 & x$demref == 0] <- "red"

x$col[x$Negotiation == 0 & x$demref == 1] <- "black"
x$col[x$Negotiation == 0 & x$demref == 0] <- "black" 


cx <- 1
png("separation_plot.png", units = "cm", 
    width = 16, height = 10, res = 300)
par(mar = c(3,4,1,1), mgp = c(1.5,0.5,0))
plot(seq(0, 1, length.out = 8), 1:8, t = "n", ylim = c(0.5,8.5), axes = F, ylab = "", xlab = "Probability",
     cex.axis = 0.75)
points(sort(x$p_m1r), rep(1, length(sort(x$p_m1r))), pch = "|", cex = cx, col = x$col)
points(sort(x$p_m1f), rep(2, length(sort(x$p_m1f))), pch = "|", cex = cx, col = x$col)

points(sort(x$p_m2r), rep(4, length(sort(x$p_m2r))), pch = "|", cex = cx, col = x$col)
points(sort(x$p_m2f), rep(5, length(sort(x$p_m2f))), pch = "|", cex = cx, col = x$col)

points(sort(x$p_m3r), rep(7, length(sort(x$p_m3r))), pch = "|", cex = cx, col = x$col)
points(sort(x$p_m3f), rep(8, length(sort(x$p_m3f))), pch = "|", cex = cx, col = x$col)

axis(1, seq(0,1,0.1), cex.axis = 0.75)
par(mgp = c(0,0,0))
axis(2, at = c(1,2,4,5,7,8), cex.axis = 0.75, las = 1, lwd = 0,
     labels = c("Model 1\nExcluded", "Model 1\nIncluded",
                "Model 2\nExcluded", "Model 2\nIncluded", 
                "Model 3\nExcluded", "Model 3\nIncluded")) 
dev.off()